In [107]:
import pandas as pd

# Quick peek at the sample: iterating a DataFrame (via list()) yields its
# column labels, so this lists the fields present in the line-delimited JSON.
json_file = 'sample_data'
list(pd.read_json(json_file, lines=True))
Out[107]:
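For very large dumps, loading even a sample through pandas can be slow; a lighter-weight check is to parse a single line directly. A minimal sketch, assuming 'sample_data' is line-delimited JSON with one record per line (this check is not part of the original notebook):

import json

# Parse only the first record and list its keys.
with open('sample_data') as f:
    first_record = json.loads(next(f))
print(sorted(first_record.keys()))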
In [ ]:
import csv
import json
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm

MIN_NUM_WORD_TOKENS = 10
TOTAL_NUM_LINES = 53851542  # $ wc -l data_full.json
PBAR_UPDATE_SIZE = 10000

tokenizer = TweetTokenizer()

def _ok_to_write(entries):
    """Keep a comment only if its author exists and its body is long enough."""
    if entries['author'] == '[deleted]':
        return False
    if entries['body'] == '[deleted]' or len(tokenizer.tokenize(entries['body'])) < MIN_NUM_WORD_TOKENS:
        return False
    return True

out_columns = [
    'author',
    'body',
    'subreddit',
    'subreddit_id',
    'score',
]

in_filename = 'data_full.json'
out_filename = 'data_full_preprocessed.csv'

count = 0
pbar = tqdm(total=TOTAL_NUM_LINES)
# newline='' keeps the csv module from inserting blank rows on Windows.
with open(out_filename, 'w', newline='') as o:
    # extrasaction='ignore' silently drops JSON fields not listed in out_columns.
    writer = csv.DictWriter(o, fieldnames=out_columns, extrasaction='ignore',
                            delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    with open(in_filename, 'r') as f:
        for line in f:
            count += 1
            # Update the progress bar in batches to avoid per-line overhead.
            if count % PBAR_UPDATE_SIZE == 0:
                pbar.update(PBAR_UPDATE_SIZE)
            entries = json.loads(line)
            if _ok_to_write(entries):
                writer.writerow(entries)
pbar.update(count % PBAR_UPDATE_SIZE)  # flush the remainder
pbar.close()
print('Done. Processed {} lines total.'.format(count))
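Once the pass finishes, a quick spot check (a hedged sketch, not part of the original run) confirms the output CSV has the expected header:

import pandas as pd

# Load only a handful of rows; the full file is too large to materialize.
head = pd.read_csv('data_full_preprocessed.csv', nrows=5)
print(head.columns.tolist())  # expected: author, body, subreddit, subreddit_id, score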
In [15]:
import pandas as pd
from tqdm import tqdm
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()

# Create synonym sets for obesity and anorexia.
def syn_set(word_list):
    """Collect every WordNet lemma name across all synsets of the given words.

    Note: lemma names for multiword expressions contain underscores, so they
    can never equal a single TweetTokenizer token; in practice only
    single-word synonyms match below.
    """
    syns = set()
    for word in word_list:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                syns.add(lemma.name())
    return syns

OBESITY_SYNS = syn_set(['obesity'])
ANOREXIA_SYNS = syn_set(['anorexia'])

def row_filter_fn(body, syns):
    """Returns True if the comment body mentions any synonym, False otherwise."""
    # Lemmatize each lowercased token and intersect with the synonym set.
    tokens = {wordnet_lemmatizer.lemmatize(token.lower())
              for token in tokenizer.tokenize(body)}
    return bool(tokens & syns)

csv_filename = 'data_full_preprocessed.csv'
chunksize = 10000
count = 0
obesity_data_frames = []
anorexia_data_frames = []

# Stream the CSV in chunks so the full dataset never sits in memory at once.
for chunk in tqdm(pd.read_csv(csv_filename, chunksize=chunksize)):
    obesity_df = chunk[chunk['body'].apply(row_filter_fn, syns=OBESITY_SYNS)]
    if not obesity_df.empty:
        obesity_data_frames.append(obesity_df)
    anorexia_df = chunk[chunk['body'].apply(row_filter_fn, syns=ANOREXIA_SYNS)]
    if not anorexia_df.empty:
        anorexia_data_frames.append(anorexia_df)
    count += 1
    #if count == 100: break
print('Total # chunks processed: {}.'.format(count))

# Write out to CSVs.
pd.concat(obesity_data_frames).to_csv('obesity.csv', index=False)
pd.concat(anorexia_data_frames).to_csv('anorexia.csv', index=False)
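As a final sanity check (an illustrative sketch; the counts shown are not from the original run), compare the sizes of the two topic-filtered corpora:

import pandas as pd

# Row counts for the two extracted corpora.
for name in ('obesity.csv', 'anorexia.csv'):
    print('{}: {} rows'.format(name, len(pd.read_csv(name))))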